{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ensemble learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Names of the pretrained feature extractors used throughout the notebook.\n",
    "model_names = [\n",
    "    'vgg16-keras',\n",
    "    'vgg19-keras',\n",
    "    'resnet50-keras',\n",
    "    'incv3-keras',\n",
    "    'Inception_v3',\n",
    "]\n",
    "\n",
    "# Our best classifiers, built on top of the pretrained models.\n",
    "classifier_filepath = {\n",
    "    'incv3-keras': 'classifiers/7577-incv3-keras.pkl',\n",
    "    'vgg16-keras': 'classifiers/8515-vgg16-keras.pkl',\n",
    "    'vgg19-keras': 'classifiers/8654-vgg19-keras.pkl',\n",
    "    'Inception_v3': 'classifiers/9061-Inception_v3.pkl',\n",
    "    'resnet50-keras': 'classifiers/9158-resnet50-keras.pkl',\n",
    "}\n",
    "\n",
    "# Load the stored CIFAR10 feature archive of every model, keyed by model name.\n",
    "data = {}\n",
    "for model_name in model_names:\n",
    "    features_path = 'features/CIFAR10_{model}_features.npz'.format(model=model_name)\n",
    "    data[model_name] = np.load(features_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "classifiers/7577-incv3-keras.pkl\r\n",
      "classifiers/8515-vgg16-keras.pkl\r\n",
      "classifiers/8654-vgg19-keras.pkl\r\n",
      "classifiers/9061-Inception_v3.pkl\r\n",
      "classifiers/9158-resnet50-keras.pkl\r\n"
     ]
    }
   ],
   "source": [
    "# List the pickled classifiers available on disk (IPython shell escape).\n",
    "!ls -1 classifiers/*.pkl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# sklearn.externals.joblib was removed from recent scikit-learn releases;\n",
    "# use the standalone joblib package when available and fall back otherwise.\n",
    "try:\n",
    "    import joblib\n",
    "except ImportError:\n",
    "    from sklearn.externals import joblib\n",
    "\n",
    "# Load every trained classifier, keyed by the name of its feature extractor.\n",
    "# NOTE: joblib.load unpickles the file, which can execute arbitrary code;\n",
    "# only load classifier files that come from a trusted source.\n",
    "clf = dict()\n",
    "for name in model_names:\n",
    "    clf[name] = joblib.load(classifier_filepath[name])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'C': 0.001,\n",
       " 'class_weight': None,\n",
       " 'dual': True,\n",
       " 'fit_intercept': True,\n",
       " 'intercept_scaling': 1,\n",
       " 'loss': 'squared_hinge',\n",
       " 'max_iter': 1000,\n",
       " 'multi_class': 'ovr',\n",
       " 'penalty': 'l2',\n",
       " 'random_state': None,\n",
       " 'tol': 0.0001,\n",
       " 'verbose': 0}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the hyper-parameters of the classifier loaded for 'resnet50-keras'.\n",
    "clf['resnet50-keras'].get_params()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let us recall the classifiers' scores (correctly classified images out of 10,000) on the CIFAR10 testing dataset\n",
    "\n",
    "    incv3    => 7577\n",
    "    vgg16    => 8515\n",
    "    vgg19    => 8654\n",
    "    Incv3    => 9061\n",
    "    resnet50 => 9158"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Majority voting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import myutils\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The first value returned by the loader is discarded; the second is kept as the testing data.\n",
    "_, data_testing = myutils.load_CIFAR_dataset(shuffle=False)\n",
    "\n",
    "# Column 1 of the testing records is used as the label\n",
    "# (presumably records are (image, label) pairs -- TODO confirm against myutils).\n",
    "y_testing = np.array( data_testing )[:,1]\n",
    "n_testing = y_testing.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "\n",
    "def majority_vote(i):\n",
    "    \"\"\"Return the class predicted by most base classifiers for testing sample `i`.\n",
    "\n",
    "    Each model in `model_names` casts one vote, read from the global\n",
    "    `y_predictions`. On a tie the lowest class index wins, because `argmax`\n",
    "    returns the first maximum.\n",
    "    \"\"\"\n",
    "    tally = np.zeros(10)  # vote counters for class labels 0..9\n",
    "    for model in model_names:\n",
    "        predicted_class = y_predictions[model][i]\n",
    "        tally[predicted_class] += 1\n",
    "    return tally.argmax()\n",
    "\n",
    "\n",
    "# Predictions of every base classifier on its own testing features.\n",
    "y_predictions = {}\n",
    "for name in model_names:\n",
    "    y_predictions[name] = clf[name].predict(data[name]['features_testing'])\n",
    "\n",
    "# One ensembled label per testing sample.\n",
    "y_ensembled = list(map(majority_vote, range(n_testing)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9184"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of testing samples on which the ensembled label equals the true label.\n",
    "np.sum(y_ensembled == y_testing)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Simple majority voting increases our best result to **91.84%**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Weighted voting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9284"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Assuming we know how good our models are, we can weight their votes accordingly.\n",
    "classifier_weights = {\n",
    "    'incv3-keras': 2,\n",
    "    'vgg16-keras': 3,\n",
    "    'vgg19-keras': 4,\n",
    "    'Inception_v3': 7,\n",
    "    'resnet50-keras': 7,\n",
    "}\n",
    "\n",
    "\n",
    "def weighted_vote(i):\n",
    "    \"\"\"Return the class with the largest total weight for testing sample `i`.\n",
    "\n",
    "    Each model in `model_names` adds its `classifier_weights` entry to the\n",
    "    class it predicted (read from the global `y_predictions`). On a tie the\n",
    "    lowest class index wins, because `argmax` returns the first maximum.\n",
    "    \"\"\"\n",
    "    tally = np.zeros(10)  # accumulated weight for class labels 0..9\n",
    "    for model in model_names:\n",
    "        predicted_class = y_predictions[model][i]\n",
    "        tally[predicted_class] += classifier_weights[model]\n",
    "    return tally.argmax()\n",
    "\n",
    "\n",
    "y_ensembled = list(map(weighted_vote, range(n_testing)))\n",
    "\n",
    "# Number of testing samples classified correctly by the weighted vote.\n",
    "np.sum( y_ensembled == y_testing )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dynamic voting\n",
    "Another idea is as follows\n",
    "1. Take the image to classify\n",
    "2. Find its K nearest neighbors\n",
    "3. Classify the neighbors using all classifiers\n",
    "4. Use weighted voting"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adaboost"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the case of our classifiers we cannot use boosting techniques.\n",
    "However, one can try with `svm.SVC(kernel='linear', probability=True)`\n",
    "in order to boost the model. \n",
    "```python\n",
    "#  base_clf = clf['resnet50-keras']\n",
    "from sklearn import svm\n",
    "base_clf = svm.SVC(probability=True,kernel='linear')\n",
    "\n",
    "n_training = 1000 # TODO: try with bigger testing data\n",
    "\n",
    "X_train = data['resnet50-keras']['features_training'][:n_training]\n",
    "y_train = data['resnet50-keras']['labels_training'][:n_training]\n",
    "\n",
    "base_clf.fit( X_train, y_train )\n",
    "base_clf.score( data['resnet50-keras']['features_testing'], data['resnet50-keras']['labels_testing'] )\n",
    "\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "boosted_model = AdaBoostClassifier(base_estimator = base_clf)\n",
    "boosted_model.fit( X_train, y_train )\n",
    "boosted_model.score( data['resnet50-keras']['features_testing'], data['resnet50-keras']['labels_testing'] )\n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Adaboost did not boost.\n",
    "<pre>\n",
    "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
    "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n",
    "  max_iter=-1, probability=True, random_state=None, shrinking=True,\n",
    "  tol=0.001, verbose=False)\n",
    "  \n",
    "0.83030000000000004\n",
    "\n",
    "AdaBoostClassifier(algorithm='SAMME.R',\n",
    "          base_estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
    "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',\n",
    "  max_iter=-1, probability=True, random_state=None, shrinking=True,\n",
    "  tol=0.001, verbose=False),\n",
    "          learning_rate=1.0, n_estimators=50, random_state=None)\n",
    "          \n",
    "0.78369999999999995\n",
    "</pre>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model stacking\n",
    "Here we present the results of stacking our two best models.\n",
    "One can try with stacking more models."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# LinearSVC keyword arguments per feature extractor; each entry is a list\n",
    "# whose first element is the keyword dict used when building the base models.\n",
    "model_params = {\n",
    "    'vgg16-keras':    [ {'C':0.0001} ],\n",
    "    'vgg19-keras':    [ {'C':0.001}  ],\n",
    "    'resnet50-keras': [ {'C':0.001}   ],\n",
    "    'Inception_v3':   [ {'C':0.01}   ],\n",
    "    'incv3-keras':    [ {'C':0.001}  ]\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So we choose `Inception v3` from Tensorflow and `ResNET50` from <tt>keras.applications</tt>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# The two base models to stack; the names are keys into `data` and `model_params`.\n",
    "model1_name = 'Inception_v3'\n",
    "model2_name = 'resnet50-keras'\n",
    "n_models   = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import LinearSVC\n",
    "# Unfitted linear SVMs for the two base models, configured from `model_params`.\n",
    "model1 = LinearSVC( **model_params[model1_name][0] )\n",
    "model2 = LinearSVC( **model_params[model2_name][0] )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We are not looking at the testing data from CIFAR10 database, yet."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Training features and labels stored for the first base model.\n",
    "X1_training = data[model1_name]['features_training']\n",
    "y1_training = data[model1_name]['labels_training']\n",
    "\n",
    "# Training features and labels stored for the second base model.\n",
    "X2_training = data[model2_name]['features_training']\n",
    "y2_training = data[model2_name]['labels_training']\n",
    "\n",
    "# Number of training samples (rows of the first model's feature matrix).\n",
    "n_training = X1_training.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We build training data for the stacked model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One row per training sample, one column per base model; the columns are\n",
    "# filled with cross-validated predictions in the cells below.\n",
    "X_cross_training = np.zeros( (n_training, n_models) )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((50000, 2048), (50000, 2048))"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X1_training.shape,  X2_training.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# y_cross_training[ [3,4,5], 0 ] = [ 6,7,2 ]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's build `X_cross_training`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN: [10000 10001 10002 ..., 49997 49998 49999] TEST: [   0    1    2 ..., 9997 9998 9999]\n",
      "TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [10000 10001 10002 ..., 19997 19998 19999]\n",
      "TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [20000 20001 20002 ..., 29997 29998 29999]\n",
      "TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [30000 30001 30002 ..., 39997 39998 39999]\n",
      "TRAIN: [    0     1     2 ..., 39997 39998 39999] TEST: [40000 40001 40002 ..., 49997 49998 49999]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import KFold\n",
    "\n",
    "# Five folds taken in order (no shuffling); every sample lands in exactly one test fold.\n",
    "kf = KFold(n_splits=5, shuffle=False)\n",
    "\n",
    "X, y = X1_training, y1_training\n",
    "for train_index, test_index in kf.split(X):\n",
    "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    # Fit on the remaining folds, then store the out-of-fold predictions\n",
    "    # of model1 in column 0 of the stacked training matrix.\n",
    "    model1.fit(X[train_index], y[train_index])\n",
    "    X_cross_training[test_index, 0] = model1.predict(X[test_index])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TRAIN: [10000 10001 10002 ..., 49997 49998 49999] TEST: [   0    1    2 ..., 9997 9998 9999]\n",
      "TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [10000 10001 10002 ..., 19997 19998 19999]\n",
      "TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [20000 20001 20002 ..., 29997 29998 29999]\n",
      "TRAIN: [    0     1     2 ..., 49997 49998 49999] TEST: [30000 30001 30002 ..., 39997 39998 39999]\n",
      "TRAIN: [    0     1     2 ..., 39997 39998 39999] TEST: [40000 40001 40002 ..., 49997 49998 49999]\n"
     ]
    }
   ],
   "source": [
    "# Fill column 1 of X_cross_training with out-of-fold predictions of model2,\n",
    "# reusing the KFold splitter `kf` so the folds match those used for model1.\n",
    "X = X2_training\n",
    "y = y2_training\n",
    "for train_index, test_index in kf.split(X):\n",
    "    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\n",
    "    X_train, X_test = X[train_index], X[test_index]\n",
    "    y_train, y_test = y[train_index], y[test_index]\n",
    "    model2.fit( X_train, y_train )\n",
    "    # Predict with model2, which was just fitted on these features; model1 was\n",
    "    # trained on a different feature space and must not be used here.\n",
    "    y_predict = model2.predict( X_test )\n",
    "    X_cross_training[test_index,1] = y_predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 6.,  6.],\n",
       "       [ 9.,  6.],\n",
       "       [ 8.,  6.],\n",
       "       [ 4.,  2.],\n",
       "       [ 1.,  1.],\n",
       "       [ 1.,  1.],\n",
       "       [ 2.,  4.],\n",
       "       [ 7.,  1.],\n",
       "       [ 8.,  6.],\n",
       "       [ 3.,  1.]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_cross_training[:10,0:n_models]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVR, SVC\n",
    "# Second-level (stacking) classifier, created with the default SVC settings.\n",
    "stacked_model = SVC()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
       "  max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
       "  tol=0.001, verbose=False)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Training stacked model\n",
    "# One can include some training features from model1 and model2   # TODO\n",
    "y_training = data[model2_name]['labels_training']  # one can take any model\n",
    "stacked_model.fit( X_cross_training, y_training )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Refit model1 on the whole training set (the K-fold loop left it fitted on\n",
    "# a subset only), then predict the testing set.\n",
    "model1.fit( X1_training , y1_training )\n",
    "X1_testing = data[model1_name]['features_testing']\n",
    "y1_predictions = model1.predict( X1_testing )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Refit model2 on the whole training set (the K-fold loop left it fitted on\n",
    "# a subset only), then predict the testing set.\n",
    "model2.fit( X2_training , y2_training )\n",
    "X2_testing = data[model2_name]['features_testing']\n",
    "y2_predictions = model2.predict( X2_testing )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([3, 8, 8, ..., 5, 0, 7]), array([3, 8, 8, ..., 5, 1, 7], dtype=uint8))"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y1_predictions, y2_predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feed the base models' testing predictions (one column each) to the stacked model.\n",
    "# NOTE: this rebinds `y_predictions`, which the voting cells use as a dict of\n",
    "# per-model predictions; re-run those cells before calling the vote functions again.\n",
    "y_predictions = stacked_model.predict( np.column_stack( (y1_predictions,y2_predictions) ) )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the moment to look at the testing data from CIFAR10 database."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Testing labels as stored next to the features; this rebinds `y_testing`.\n",
    "y_testing = data[model2_name]['labels_testing']  # one can take any model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "9061"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of testing samples the stacked model classifies correctly.\n",
    "np.sum( y_predictions == y_testing )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Stacking model accuracy is **90.61%**, which is not better than the accuracy gained by the base models."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let us recall the accuracies of the base models:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.90610000000000002, 0.91579999999999995)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Testing accuracy of each base model on its own features.\n",
    "model1.score( X1_testing, y_testing ),  model2.score( X2_testing, y_testing )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
